require(Hmisc)
knitrSet(lang='markdown')
require(rms)
getHdata(FEV)
contents(FEV)
Data frame:FEV 654 observations and 6 variables Maximum # NAs:0
Units Levels Storage
id integer
age years integer
fev liters double
height inches double
sex 2 integer
smoke 2 integer
+--------+---------------------------------+
|Variable|Levels |
+--------+---------------------------------+
| sex |female,male |
+--------+---------------------------------+
| smoke |non-current smoker,current smoker|
+--------+---------------------------------+
describe(FEV)
FEV
6 Variables 654 Observations
---------------------------------------------------------------------------
id
n missing unique Info Mean .05 .10 .25 .50
654 0 654 1 37170 3142 6162 15811 36071
.75 .90 .95
53638 73342 77706
lowest : 201 202 301 341 351
highest: 83841 83901 83951 83952 90001
---------------------------------------------------------------------------
age [years]
n missing unique Info Mean .05 .10 .25 .50
654 0 17 0.988 9.931 5 6 8 10
.75 .90 .95
12 14 15
lowest : 3 4 5 6 7, highest: 15 16 17 18 19
Value 3 4 5 6 7 8 9 10 11 12
Frequency 2 9 28 37 54 85 94 81 90 57
Proportion 0.003 0.014 0.043 0.057 0.083 0.130 0.144 0.124 0.138 0.087
Value 13 14 15 16 17 18 19
Frequency 43 25 19 13 8 6 3
Proportion 0.066 0.038 0.029 0.020 0.012 0.009 0.005
---------------------------------------------------------------------------
fev [liters]
n missing unique Info Mean .05 .10 .25 .50
654 0 575 1 2.637 1.445 1.612 1.981 2.547
.75 .90 .95
3.119 3.813 4.289
lowest : 0.791 0.796 0.839 1.004 1.072
highest: 5.102 5.224 5.633 5.638 5.793
---------------------------------------------------------------------------
height [inches]
n missing unique Info Mean .05 .10 .25 .50
654 0 56 0.999 61.14 51.0 53.0 57.0 61.5
.75 .90 .95
65.5 68.5 70.0
lowest : 46.0 46.5 47.0 48.0 49.0, highest: 72.0 72.5 73.0 73.5 74.0
---------------------------------------------------------------------------
sex
n missing unique
654 0 2
female (318, 0.486), male (336, 0.514)
---------------------------------------------------------------------------
smoke
n missing unique
654 0 2
non-current smoker (589, 0.901), current smoker (65, 0.099)
---------------------------------------------------------------------------
ggplot(FEV, aes(x=age, y=fev, color=height)) + geom_point() + facet_grid(smoke ~ sex)
dd <- datadist(FEV); options(datadist='dd')
dd
id age fev height sex smoke
Low:effect 15811.0000 8 1.981000 57.0 <NA> <NA>
Adjust to 36071.0000 10 2.547500 61.5 male non-current smoker
High:effect 53638.5000 12 3.118500 65.5 <NA> <NA>
Low:prediction 600.2355 4 1.252128 49.0 female non-current smoker
High:prediction 81741.1529 17 4.789642 72.0 male current smoker
Low 201.0000 3 0.791000 46.0 female non-current smoker
High 90001.0000 19 5.793000 74.0 male current smoker
Values:
sex : female male
smoke : non-current smoker current smoker
FEV is log-transformed to satisfy normality and equal variance assumptions.
f <- ols(log(fev) ~ age, data=FEV)
f
Linear Regression Model
ols(formula = log(fev) ~ age, data = FEV)
Model Likelihood Discrimination
Ratio Test Indexes
Obs 654 LR chi2 592.40 R2 0.596
sigma 0.2120 d.f. 1 R2 adj 0.595
d.f. 652 Pr(> chi2) 0.0000 g 0.287
Residuals
Min 1Q Median 3Q Max
-0.72047 -0.13752 0.00302 0.14681 0.60267
Coef S.E. t Pr(>|t|)
Intercept 0.0506 0.0291 1.74 0.0826
age 0.0871 0.0028 31.00 <0.0001
r <- as.numeric(resid(f))
with(FEV, plot(age, r))
qqnorm(r)
qqline(r)
# Show algebraic form of fitted model
Function(f)
function (age = 10)
{
0.050596002 + 0.087083296 * age
}
<environment: 0x88b0db8>
g <- Function(f)
exp(g()); exp(g(age=10)) # Evaluate the fitted model, antilog to get original scale
[1] 2.512879
[1] 2.512879
ggplot(Predict(f)) + geom_point(aes(x=age, y=log(fev), color=sex), FEV)
The knots are at these quantiles of age: 0.1, 0.5, 0.9.
f <- ols(log(fev) ~ rcs(age, 3), data=FEV, x=TRUE)
f
Linear Regression Model
ols(formula = log(fev) ~ rcs(age, 3), data = FEV, x = TRUE)
Model Likelihood Discrimination
Ratio Test Indexes
Obs 654 LR chi2 668.33 R2 0.640
sigma 0.2002 d.f. 2 R2 adj 0.639
d.f. 651 Pr(> chi2) 0.0000 g 0.297
Residuals
Min 1Q Median 3Q Max
-0.608234 -0.134592 0.006764 0.139165 0.538798
Coef S.E. t Pr(>|t|)
Intercept -0.3377 0.0513 -6.58 <0.0001
age 0.1386 0.0063 21.88 <0.0001
age' -0.0627 0.0070 -8.95 <0.0001
g <- Function(f)
g(10:20)
[1] 0.9852999 1.0660775 1.1292243 1.1806172 1.2261332 1.2706697 1.3152062
[8] 1.3597427 1.4042792 1.4488157 1.4933522
g()
[1] 0.9852999
g
function (age = 10)
{
-0.33768719 + 0.13856744 * age - 0.00097948891 * pmax(age -
6, 0)^3 + 0.0019589778 * pmax(age - 10, 0)^3 - 0.00097948891 *
pmax(age - 14, 0)^3
}
<environment: 0x9aaf7c0>
ggplot(Predict(f))
anova(f)
Analysis of Variance Response: log(fev)
Factor d.f. Partial SS MS F P
age 2 46.42323 23.21161744 578.90 <.0001
Nonlinear 1 3.21318 3.21318002 80.14 <.0001
REGRESSION 2 46.42323 23.21161744 578.90 <.0001
ERROR 651 26.10268 0.04009628
f <- ols(log(fev) ~ rcs(age, 5), data=FEV)
f
Linear Regression Model
ols(formula = log(fev) ~ rcs(age, 5), data = FEV)
Model Likelihood Discrimination
Ratio Test Indexes
Obs 654 LR chi2 678.99 R2 0.646
sigma 0.1989 d.f. 4 R2 adj 0.644
d.f. 649 Pr(> chi2) 0.0000 g 0.303
Residuals
Min 1Q Median 3Q Max
-0.621716 -0.135183 0.008372 0.144535 0.543184
Coef S.E. t Pr(>|t|)
Intercept -0.1758 0.0985 -1.78 0.0748
age 0.1126 0.0159 7.10 <0.0001
age' 0.0383 0.0817 0.47 0.6391
age'' -0.2167 0.4077 -0.53 0.5951
age''' -0.1496 0.9059 -0.17 0.8689
Function(f)
function (age = 10)
{
-0.17580202 + 0.11261215 * age + 0.0003834753 * pmax(age -
5, 0)^3 - 0.0021674701 * pmax(age - 8, 0)^3 - 0.0014960677 *
pmax(age - 10, 0)^3 + 0.0047044691 * pmax(age - 11, 0)^3 -
0.0014244066 * pmax(age - 15, 0)^3
}
<environment: 0x86abf50>
anova(f)
Analysis of Variance Response: log(fev)
Factor d.f. Partial SS MS F P
age 4 46.845368 11.71134190 295.97 <.0001
Nonlinear 3 3.635313 1.21177091 30.62 <.0001
REGRESSION 4 46.845368 11.71134190 295.97 <.0001
ERROR 649 25.680547 0.03956941
ggplot(Predict(f))
f <- ols(log(fev) ~ rcs(age, 5) + sex, data=FEV)
Function(f)
function (age = 10, sex = "male")
{
-0.23344699 + 0.11429274 * age + 0.00019963206 * pmax(age -
5, 0)^3 - 0.0010413455 * pmax(age - 8, 0)^3 - 0.0043083413 *
pmax(age - 10, 0)^3 + 0.0067087011 * pmax(age - 11, 0)^3 -
0.0015586464 * pmax(age - 15, 0)^3 + 0.099501651 * (sex ==
"male")
}
<environment: 0xc00d218>
anova(f)
Analysis of Variance Response: log(fev)
Factor d.f. Partial SS MS F P
age 4 46.370003 11.5925008 312.11 <.0001
Nonlinear 3 3.675485 1.2251616 32.99 <.0001
sex 1 1.612016 1.6120158 43.40 <.0001
REGRESSION 5 48.457383 9.6914767 260.92 <.0001
ERROR 648 24.068532 0.0371428
ggplot(Predict(f, age, sex)) + geom_point(aes(x=age, y=log(fev), color=sex), FEV)
The following model allows for different shapes of effects for males and females.
f <- ols(log(fev) ~ rcs(age, 5) * sex, data=FEV)
r <- as.numeric(resid(f))
plot(fitted(f), r)
qqnorm(r); qqline(r)
Function(f)
function (age = 10, sex = "male")
{
-0.33458936 + 0.13366998 * age - 2.9172721e-05 * pmax(age -
5, 0)^3 - 0.0039056026 * pmax(age - 8, 0)^3 + 0.0078871147 *
pmax(age - 10, 0)^3 - 0.002951157 * pmax(age - 11, 0)^3 -
0.0010011823 * pmax(age - 15, 0)^3 + 0.34502774 * (sex ==
"male") + (sex == "male") * (-0.046273816 * age + 0.00093632418 *
pmax(age - 5, 0)^3 + 0.0034513932 * pmax(age - 8, 0)^3 -
0.020568634 * pmax(age - 10, 0)^3 + 0.017330044 * pmax(age -
11, 0)^3 - 0.0011491273 * pmax(age - 15, 0)^3)
}
<environment: 0x73905b8>
anova(f)
Analysis of Variance Response: log(fev)
Factor d.f. Partial SS MS
age (Factor+Higher Order Factors) 8 48.3058462 6.03823078
All Interactions 4 1.9358432 0.48396079
Nonlinear (Factor+Higher Order Factors) 6 4.7393834 0.78989723
sex (Factor+Higher Order Factors) 5 3.5478589 0.70957178
All Interactions 4 1.9358432 0.48396079
age * sex (Factor+Higher Order Factors) 4 1.9358432 0.48396079
Nonlinear 3 0.8210779 0.27369262
Nonlinear Interaction : f(A,B) vs. AB 3 0.8210779 0.27369262
TOTAL NONLINEAR 6 4.7393834 0.78989723
TOTAL NONLINEAR + INTERACTION 7 5.6113278 0.80161826
REGRESSION 9 50.3932265 5.59924739
ERROR 644 22.1326884 0.03436753
F P
175.70 <.0001
14.08 <.0001
22.98 <.0001
20.65 <.0001
14.08 <.0001
14.08 <.0001
7.96 <.0001
7.96 <.0001
22.98 <.0001
23.32 <.0001
162.92 <.0001
ggplot(Predict(f, age, sex))
Plot predicted median FEV by anti-logging predicted values.
ggplot(Predict(f, age, sex, fun=exp), ylab='Estimated Median FEV')
We show the joint age and height effects using a color image plot. A silly x-axis label is used to show LaTeX math mode.
f <- ols(log(fev) ~ rcs(age, 5) * sex + rcs(height, 5), data=FEV)
anova(f)
Analysis of Variance Response: log(fev)
Factor d.f. Partial SS MS
age (Factor+Higher Order Factors) 8 1.3289091 0.16611364
All Interactions 4 0.3636977 0.09092444
Nonlinear (Factor+Higher Order Factors) 6 0.4358008 0.07263346
sex (Factor+Higher Order Factors) 5 0.5969813 0.11939627
All Interactions 4 0.3636977 0.09092444
height 4 8.8829270 2.22073175
Nonlinear 3 0.1721358 0.05737861
age * sex (Factor+Higher Order Factors) 4 0.3636977 0.09092444
Nonlinear 3 0.3333276 0.11110920
Nonlinear Interaction : f(A,B) vs. AB 3 0.3333276 0.11110920
TOTAL NONLINEAR 9 0.5804472 0.06449413
TOTAL NONLINEAR + INTERACTION 10 0.5865061 0.05865061
REGRESSION 13 59.2761535 4.55970412
ERROR 640 13.2497614 0.02070275
F P
8.02 <.0001
4.39 0.0016
3.51 0.0020
5.77 <.0001
4.39 0.0016
107.27 <.0001
2.77 0.0408
4.39 0.0016
5.37 0.0012
5.37 0.0012
3.12 0.0011
2.83 0.0019
220.25 <.0001
require(plotly)
ggplotly(ggplot(Predict(f, age, sex), xlab='$\\chi^{2}$',
adj.subtitle=FALSE))
p <- Predict(f, age, height)
bplot(p)
p <- with(p, list(age=unique(age), height=unique(height),
Yhat=matrix(yhat, ncol=200)))
plot_ly(p, x=age, y=height, z=Yhat,
type='heatmap')
plot_ly(p, x=age, y=height, z=Yhat, type='surface')
By plotting spike histograms showing the smoking-specific age distribution we see that there are almost no very young children who have smoked. This limits the power to detect an age by smoking interaction.
f <- ols(log(fev) ~ rcs(age, 5) + smoke, data=FEV)
ggplotly(ggplot(Predict(f, age, smoke), rdata=FEV))
R version 3.3.1 (2016-06-21)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 16.04 LTS
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] plotly_3.6.0 rms_4.5-1 SparseM_1.7 Hmisc_3.18-0
[5] ggplot2_2.1.0 Formula_1.2-1 survival_2.39-5 lattice_0.20-33
R Core Team (2016). R: A Language and Environment for Statistical Computing. R Foundation for Statistical Computing, Vienna, Austria.